| Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 1645984) |
| +++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision ) |
| @@ -17,6 +17,8 @@ |
| * limitations under the License. |
| */ |
| |
| +import javax.xml.parsers.DocumentBuilder; |
| +import javax.xml.parsers.DocumentBuilderFactory; |
| import java.io.ByteArrayInputStream; |
| import java.io.IOException; |
| import java.nio.charset.StandardCharsets; |
| @@ -28,10 +30,17 @@ |
| import java.util.List; |
| import java.util.Map; |
| import java.util.StringTokenizer; |
| -import javax.xml.parsers.DocumentBuilder; |
| -import javax.xml.parsers.DocumentBuilderFactory; |
| |
| -import org.apache.lucene.analysis.*; |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.analysis.CachingTokenFilter; |
| +import org.apache.lucene.analysis.MockAnalyzer; |
| +import org.apache.lucene.analysis.MockPayloadAnalyzer; |
| +import org.apache.lucene.analysis.MockTokenFilter; |
| +import org.apache.lucene.analysis.MockTokenizer; |
| +import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| @@ -44,20 +53,43 @@ |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriter; |
| -import org.apache.lucene.index.IndexWriterConfig.OpenMode; |
| import org.apache.lucene.index.IndexWriterConfig; |
| +import org.apache.lucene.index.IndexWriterConfig.OpenMode; |
| import org.apache.lucene.index.StoredDocument; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.queries.CommonTermsQuery; |
| -import org.apache.lucene.search.*; |
| import org.apache.lucene.search.BooleanClause.Occur; |
| +import org.apache.lucene.search.BooleanQuery; |
| +import org.apache.lucene.search.ConstantScoreQuery; |
| +import org.apache.lucene.search.FilteredQuery; |
| +import org.apache.lucene.search.FuzzyQuery; |
| +import org.apache.lucene.search.IndexSearcher; |
| +import org.apache.lucene.search.MultiPhraseQuery; |
| +import org.apache.lucene.search.MultiTermQuery; |
| +import org.apache.lucene.search.NumericRangeQuery; |
| +import org.apache.lucene.search.PhraseQuery; |
| +import org.apache.lucene.search.PrefixQuery; |
| +import org.apache.lucene.search.Query; |
| +import org.apache.lucene.search.QueryWrapperFilter; |
| +import org.apache.lucene.search.RegexpQuery; |
| +import org.apache.lucene.search.TermQuery; |
| +import org.apache.lucene.search.TermRangeFilter; |
| +import org.apache.lucene.search.TermRangeQuery; |
| +import org.apache.lucene.search.TopDocs; |
| +import org.apache.lucene.search.WildcardQuery; |
| import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner; |
| import org.apache.lucene.search.join.BitDocIdSetCachingWrapperFilter; |
| import org.apache.lucene.search.join.BitDocIdSetFilter; |
| import org.apache.lucene.search.join.ScoreMode; |
| import org.apache.lucene.search.join.ToChildBlockJoinQuery; |
| import org.apache.lucene.search.join.ToParentBlockJoinQuery; |
| -import org.apache.lucene.search.spans.*; |
| +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; |
| +import org.apache.lucene.search.spans.SpanNearQuery; |
| +import org.apache.lucene.search.spans.SpanNotQuery; |
| +import org.apache.lucene.search.spans.SpanOrQuery; |
| +import org.apache.lucene.search.spans.SpanPayloadCheckQuery; |
| +import org.apache.lucene.search.spans.SpanQuery; |
| +import org.apache.lucene.search.spans.SpanTermQuery; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| @@ -1891,7 +1923,7 @@ |
| reader.close(); |
| } |
| |
| - /** If we have term vectors, we can highlight based on payloads */ |
| + /** We can highlight based on payloads. As of Lucene 5, this is supported both via term vectors and via MemoryIndex. */ |
| public void testPayloadQuery() throws IOException, InvalidTokenOffsetsException { |
| final String text = "random words and words";//"words" at positions 1 & 4 |
| |
| @@ -1900,7 +1932,7 @@ |
| writer.deleteAll(); |
| Document doc = new Document(); |
| |
| - doc.add(new Field(FIELD_NAME, text, FIELD_TYPE_TV)); |
| + doc.add(new Field(FIELD_NAME, text, fieldType)); |
| writer.addDocument(doc); |
| writer.commit(); |
| } |
| @@ -1908,12 +1940,17 @@ |
| Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "words")), |
| Collections.singleton("pos: 1".getBytes("UTF-8")));//just match the first "words" occurrence |
| IndexSearcher searcher = newSearcher(reader); |
| - Scorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME); |
| + QueryScorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME); |
| + scorer.setUsePayloads(true); |
| Highlighter h = new Highlighter(scorer); |
| |
| TopDocs hits = searcher.search(query, null, 10); |
| assertEquals(1, hits.scoreDocs.length); |
| TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), 0, FIELD_NAME, analyzer); |
| + if (random().nextBoolean()) { |
| + stream.reset(); |
| + stream = new CachingTokenFilter(stream);//hide the TokenStreamFromTermVector so it isn't detected |
| + } |
| String result = h.getBestFragment(stream, text); |
| assertEquals("random <B>words</B> and words", result);//only highlight first "word" |
| } |
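| |
| Distilled from the test above, the end-to-end setup for payload-sensitive highlighting is roughly the |
| following sketch (not part of the patch; FIELD_NAME, analyzer, reader and text are as in the test, and |
| the "pos: 1" payload assumes a MockPayloadAnalyzer-style payload format): |
| |
|   Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "words")), |
|       Collections.singleton("pos: 1".getBytes(StandardCharsets.UTF_8))); |
|   QueryScorer scorer = new QueryScorer(query, reader, FIELD_NAME); |
|   scorer.setUsePayloads(true); // new in this patch; without it the MemoryIndex path ignores payloads |
|   Highlighter highlighter = new Highlighter(scorer); |
|   TokenStream stream = TokenSources.getAnyTokenStream(reader, 0, FIELD_NAME, analyzer); |
|   String fragment = highlighter.getBestFragment(stream, text); // "random <B>words</B> and words" |
| |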
| Index: lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstRAMDir.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstRAMDir.java (revision 1645984) |
| +++ lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstRAMDir.java (revision ) |
| @@ -68,8 +68,8 @@ |
| import org.apache.lucene.search.spans.SpanQuery; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.RAMDirectory; |
| -import org.apache.lucene.util.ByteBlockPool.Allocator; |
| import org.apache.lucene.util.ByteBlockPool; |
| +import org.apache.lucene.util.ByteBlockPool.Allocator; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.LineFileDocs; |
| @@ -116,7 +116,7 @@ |
| * runs random tests, up to ITERATIONS times. |
| */ |
| public void testRandomQueries() throws Exception { |
| - MemoryIndex index = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024); |
| + MemoryIndex index = randomMemoryIndex(); |
| for (int i = 0; i < ITERATIONS; i++) { |
| assertAgainstRAMDirectory(index); |
| } |
| @@ -148,7 +148,8 @@ |
| Directory ramdir = new RAMDirectory(); |
| Analyzer analyzer = randomAnalyzer(); |
| IndexWriter writer = new IndexWriter(ramdir, |
| - new IndexWriterConfig(analyzer).setCodec(TestUtil.alwaysPostingsFormat(TestUtil.getDefaultPostingsFormat()))); |
| + new IndexWriterConfig(analyzer).setCodec( |
| + TestUtil.alwaysPostingsFormat(TestUtil.getDefaultPostingsFormat()))); |
| Document doc = new Document(); |
| Field field1 = newTextField("foo", fooField.toString(), Field.Store.NO); |
| Field field2 = newTextField("term", termField.toString(), Field.Store.NO); |
| @@ -209,7 +210,11 @@ |
| assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset()); |
| assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset()); |
| } |
| + |
| + if (iwTerms.hasPayloads()) { |
| + assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload()); |
| - } |
| + } |
| + } |
| |
| } |
| |
| @@ -311,7 +316,7 @@ |
| |
| public void testDocsEnumStart() throws Exception { |
| Analyzer analyzer = new MockAnalyzer(random()); |
| - MemoryIndex memory = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024); |
| + MemoryIndex memory = new MemoryIndex(random().nextBoolean(), false, random().nextInt(50) * 1024 * 1024); |
| memory.addField("foo", "bar", analyzer); |
| LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader(); |
| DocsEnum disi = TestUtil.docs(random(), reader, "foo", new BytesRef("bar"), null, null, DocsEnum.FLAG_NONE); |
| @@ -336,11 +341,15 @@ |
| return new ByteBlockPool.DirectAllocator(); |
| } |
| } |
| - |
| + |
| + private MemoryIndex randomMemoryIndex() { |
| + return new MemoryIndex(random().nextBoolean(), random().nextBoolean(), random().nextInt(50) * 1024 * 1024); |
| + } |
| + |
| public void testDocsAndPositionsEnumStart() throws Exception { |
| Analyzer analyzer = new MockAnalyzer(random()); |
| int numIters = atLeast(3); |
| - MemoryIndex memory = new MemoryIndex(true, random().nextInt(50) * 1024 * 1024); |
| + MemoryIndex memory = new MemoryIndex(true, false, random().nextInt(50) * 1024 * 1024); |
| for (int i = 0; i < numIters; i++) { // check reuse |
| memory.addField("foo", "bar", analyzer); |
| LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader(); |
| @@ -370,7 +379,7 @@ |
| RegexpQuery regex = new RegexpQuery(new Term("field", "worl.")); |
| SpanQuery wrappedquery = new SpanMultiTermQueryWrapper<>(regex); |
| |
| - MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024); |
| + MemoryIndex mindex = randomMemoryIndex(); |
| mindex.addField("field", new MockAnalyzer(random()).tokenStream("field", "hello there")); |
| |
| // This throws an NPE |
| @@ -382,7 +391,7 @@ |
| RegexpQuery regex = new RegexpQuery(new Term("field", "worl.")); |
| SpanQuery wrappedquery = new SpanOrQuery(new SpanMultiTermQueryWrapper<>(regex)); |
| |
| - MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024); |
| + MemoryIndex mindex = randomMemoryIndex(); |
| mindex.addField("field", new MockAnalyzer(random()).tokenStream("field", "hello there")); |
| |
| // This passes though |
| @@ -390,7 +399,7 @@ |
| } |
| |
| public void testSameFieldAddedMultipleTimes() throws IOException { |
| - MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024); |
| + MemoryIndex mindex = randomMemoryIndex(); |
| MockAnalyzer mockAnalyzer = new MockAnalyzer(random()); |
| mindex.addField("field", "the quick brown fox", mockAnalyzer); |
| mindex.addField("field", "jumps over the", mockAnalyzer); |
| @@ -409,8 +418,8 @@ |
| assertTrue("posGap" + mockAnalyzer.getPositionIncrementGap("field") , mindex.search(query) > 0.0001); |
| } |
| |
| - public void testNonExistingsField() throws IOException { |
| - MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024); |
| + public void testNonExistentField() throws IOException { |
| + MemoryIndex mindex = randomMemoryIndex(); |
| MockAnalyzer mockAnalyzer = new MockAnalyzer(random()); |
| mindex.addField("field", "the quick brown fox", mockAnalyzer); |
| LeafReader reader = (LeafReader) mindex.createSearcher().getIndexReader(); |
| @@ -420,11 +429,11 @@ |
| assertNull(reader.termPositionsEnum(new Term("not-in-index", "foo"))); |
| assertNull(reader.terms("not-in-index")); |
| } |
| - |
| + |
| public void testDuellMemIndex() throws IOException { |
| LineFileDocs lineFileDocs = new LineFileDocs(random()); |
| int numDocs = atLeast(10); |
| - MemoryIndex memory = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024); |
| + MemoryIndex memory = randomMemoryIndex(); |
| for (int i = 0; i < numDocs; i++) { |
| Directory dir = newDirectory(); |
| MockAnalyzer mockAnalyzer = new MockAnalyzer(random()); |
| @@ -535,7 +544,7 @@ |
| assertThat("Position test failed" + failDesc, memPos, equalTo(pos)); |
| assertThat("Start offset test failed" + failDesc, memDocsPosEnum.startOffset(), equalTo(docsPosEnum.startOffset())); |
| assertThat("End offset test failed" + failDesc, memDocsPosEnum.endOffset(), equalTo(docsPosEnum.endOffset())); |
| - assertThat("Missing payload test failed" + failDesc, docsPosEnum.getPayload(), equalTo(null)); |
| + assertThat("Missing payload test failed" + failDesc, docsPosEnum.getPayload(), equalTo(docsPosEnum.getPayload())); |
| } |
| } |
| assertNull("Still some tokens not processed", memTermEnum.next()); |
| Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision 1645984) |
| +++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision ) |
| @@ -21,18 +21,24 @@ |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl; |
| import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.index.DocsAndPositionsEnum; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| +import org.apache.lucene.util.AttributeFactory; |
| import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.BytesRefArray; |
| +import org.apache.lucene.util.BytesRefBuilder; |
| +import org.apache.lucene.util.Counter; |
| import org.apache.lucene.util.UnicodeUtil; |
| |
| /** |
| * TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you |
| * want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just |
| - * because you know the term vector has payloads. This TokenStream supports an efficient {@link #reset()}, so there's |
| + * because you know the term vector has payloads: the first call to incrementToken() checks whether the consumer |
| + * asked for payloads, and they are fetched only if so. This TokenStream supports an efficient {@link #reset()}, so there's |
| * no need to wrap with a caching impl. |
| * <p /> |
| * The implementation will create an array of tokens indexed by token position. As long as there aren't massive jumps |
| @@ -47,6 +53,11 @@ |
| |
| //TODO add a maxStartOffset filter, which highlighters will find handy |
| |
| + //This attribute factory uses less memory when captureState() is called. |
| + public static final AttributeFactory ATTRIBUTE_FACTORY = |
| + AttributeFactory.getStaticImplementation( |
| + AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, PackedTokenAttributeImpl.class); |
| + |
| private final Terms vector; |
| |
| private final CharTermAttribute termAttribute; |
| @@ -56,11 +67,15 @@ |
| private OffsetAttribute offsetAttribute;//maybe null |
| |
| private PayloadAttribute payloadAttribute;//maybe null |
| + private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null |
| + private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null |
| |
| private TokenLL firstToken = null; // the head of a linked-list |
| |
| private TokenLL incrementToken = null; |
| |
| + private boolean initialized = false;//lazy |
| + |
| /** |
| * Constructor. |
| * |
| @@ -68,6 +83,8 @@ |
| * creating the TokenStream. Must have positions and/or offsets. |
| */ |
| public TokenStreamFromTermVector(Terms vector) throws IOException { |
| + super(ATTRIBUTE_FACTORY); |
| + assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*"; |
| if (!vector.hasPositions() && !vector.hasOffsets()) { |
| throw new IllegalArgumentException("The term vector needs positions and/or offsets."); |
| } |
| @@ -81,20 +98,20 @@ |
| |
| @Override |
| public void reset() throws IOException { |
| - if (firstToken == null) {//just the first time |
| - init(); |
| - } |
| incrementToken = null; |
| super.reset(); |
| } |
| |
| - //We initialize in reset() because we can see which attributes the consumer wants, particularly payloads |
| +//We delay initialization until the first incrementToken() so we can see which attributes the consumer wants, particularly payloads |
| private void init() throws IOException { |
| + assert !initialized; |
| if (vector.hasOffsets()) { |
| offsetAttribute = addAttribute(OffsetAttribute.class); |
| } |
| if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) { |
| payloadAttribute = getAttribute(PayloadAttribute.class); |
| + payloadsBytesRefArray = new BytesRefArray(Counter.newCounter()); |
| + spareBytesRefBuilder = new BytesRefBuilder(); |
| } |
| |
| // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position |
| @@ -132,14 +149,9 @@ |
| } |
| |
| if (payloadAttribute != null) { |
| - // Must make a deep copy of the returned payload, |
| - // since D&PEnum API is allowed to re-use on every |
| - // call: |
| final BytesRef payload = dpEnum.getPayload(); |
| - if (payload != null) { |
| - token.payload = BytesRef.deepCopyOf(payload);//TODO share a ByteBlockPool & re-use BytesRef |
| + token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload); |
| - } |
| + } |
| - } |
| |
| //Add token to an array indexed by position |
| if (positionedTokens.length <= pos) { |
| @@ -198,6 +210,8 @@ |
| prevTokenPos = pos; |
| prevToken = token; |
| } |
| + |
| + initialized = true; |
| } |
| |
| private TokenLL[] initTokensArray() throws IOException { |
| @@ -216,8 +230,12 @@ |
| } |
| |
| @Override |
| - public boolean incrementToken() { |
| + public boolean incrementToken() throws IOException { |
| if (incrementToken == null) { |
| + if (!initialized) { |
| + init(); |
| + assert initialized; |
| + } |
| incrementToken = firstToken; |
| if (incrementToken == null) { |
| return false; |
| @@ -234,8 +252,12 @@ |
| offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset); |
| } |
| if (payloadAttribute != null) { |
| - payloadAttribute.setPayload(incrementToken.payload); |
| + if (incrementToken.payloadIndex == -1) { |
| + payloadAttribute.setPayload(null); |
| + } else { |
| + payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex)); |
| - } |
| + } |
| + } |
| return true; |
| } |
| |
| @@ -245,7 +267,7 @@ |
| int positionIncrement; |
| int startOffset; |
| int endOffset; |
| - BytesRef payload; |
| + int payloadIndex; |
| |
| TokenLL next; |
| |
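| Per the revised class javadoc, payloads are opt-in: the consumer must add PayloadAttribute before the |
| first incrementToken(), because initialization is now deferred until then. A minimal consumer sketch |
| (illustrative, not part of the patch; termVector is a Terms instance with positions and/or offsets): |
| |
|   TokenStream stream = new TokenStreamFromTermVector(termVector); |
|   PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class); // before incrementToken() |
|   stream.reset(); |
|   while (stream.incrementToken()) { |
|     BytesRef payload = payloadAtt.getPayload(); // null for tokens without a payload |
|   } |
|   stream.end(); |
|   stream.close(); |
| |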
| Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 1645984) |
| +++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision ) |
| @@ -83,9 +83,9 @@ |
| private boolean cachedTokenStream; |
| private boolean wrapToCaching = true; |
| private int maxDocCharsToAnalyze; |
| + private boolean usePayloads = false; |
| private LeafReader internalReader = null; |
| |
| - |
| public WeightedSpanTermExtractor() { |
| } |
| |
| @@ -384,7 +384,7 @@ |
| |
| // Use MemoryIndex (index/invert this tokenStream now) |
| if (internalReader == null) { |
| - final MemoryIndex indexer = new MemoryIndex(true); |
| + final MemoryIndex indexer = new MemoryIndex(true, usePayloads);//offsets and payloads |
| if (cacheIt) { |
| assert !cachedTokenStream; |
| tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze)); |
| @@ -652,8 +652,16 @@ |
| |
| public void setExpandMultiTermQuery(boolean expandMultiTermQuery) { |
| this.expandMultiTermQuery = expandMultiTermQuery; |
| + } |
| + |
| + public boolean isUsePayloads() { |
| + return usePayloads; |
| + } |
| + |
| + public void setUsePayloads(boolean usePayloads) { |
| + this.usePayloads = usePayloads; |
| } |
| - |
| + |
| public boolean isCachedTokenStream() { |
| return cachedTokenStream; |
| } |
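| |
| The new flag can also be used on the extractor directly, without going through QueryScorer; a short |
| sketch under the signatures visible in this patch (query and tokenStream are assumed to exist): |
| |
|   WeightedSpanTermExtractor extractor = new WeightedSpanTermExtractor(); |
|   extractor.setUsePayloads(true); // the internal MemoryIndex becomes new MemoryIndex(true, true) |
|   Map<String, WeightedSpanTerm> weightedTerms = |
|       extractor.getWeightedSpanTerms(query, tokenStream, "field"); |
| |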
| Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 1645984) |
| +++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision ) |
| @@ -54,6 +54,7 @@ |
| private boolean skipInitExtractor; |
| private boolean wrapToCaching = true; |
| private int maxCharsToAnalyze; |
| + private boolean usePayloads = false; |
| |
| /** |
| * @param query Query to use for highlighting |
| @@ -213,6 +214,7 @@ |
| qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze); |
| qse.setExpandMultiTermQuery(expandMultiTermQuery); |
| qse.setWrapIfNotCachingTokenFilter(wrapToCaching); |
| + qse.setUsePayloads(usePayloads); |
| if (reader == null) { |
| this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query, |
| tokenStream, field); |
| @@ -258,8 +260,20 @@ |
| */ |
| public void setExpandMultiTermQuery(boolean expandMultiTermQuery) { |
| this.expandMultiTermQuery = expandMultiTermQuery; |
| + } |
| + |
| + /** |
| + * Whether or not we should capture payloads in {@link MemoryIndex} at each position so that queries can access them. |
| + * This does not apply to term vector based TokenStreams, which support payloads only when the term vector has them. |
| + */ |
| + public boolean isUsePayloads() { |
| + return usePayloads; |
| + } |
| + |
| + public void setUsePayloads(boolean usePayloads) { |
| + this.usePayloads = usePayloads; |
| } |
| - |
| + |
| /** |
| * By default, {@link TokenStream}s that are not of the type |
| * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to |
| Index: lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 1645984) |
| +++ lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision ) |
| @@ -29,6 +29,7 @@ |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; |
| import org.apache.lucene.index.BinaryDocValues; |
| @@ -60,6 +61,8 @@ |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.ByteBlockPool; |
| import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.BytesRefArray; |
| +import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.BytesRefHash; |
| import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; |
| import org.apache.lucene.util.Counter; |
| @@ -187,17 +190,19 @@ |
| */ |
| public class MemoryIndex { |
| |
| + private static final boolean DEBUG = false; |
| + |
| /** info for each field: Map<String fieldName, Info field> */ |
| private final SortedMap<String,Info> fields = new TreeMap<>(); |
| |
| private final boolean storeOffsets; |
| + private final boolean storePayloads; |
| - |
| + |
| - private static final boolean DEBUG = false; |
| - |
| private final ByteBlockPool byteBlockPool; |
| private final IntBlockPool intBlockPool; |
| // private final IntBlockPool.SliceReader postingsReader; |
| private final IntBlockPool.SliceWriter postingsWriter; |
| + private final BytesRefArray payloadsBytesRefs;//non-null only when storePayloads |
| |
| private Counter bytesUsed; |
| |
| @@ -206,7 +211,7 @@ |
| private Similarity normSimilarity = IndexSearcher.getDefaultSimilarity(); |
| |
| /** |
| - * Constructs an empty instance. |
| + * Constructs an empty instance that will not store offsets or payloads. |
| */ |
| public MemoryIndex() { |
| this(false); |
| @@ -215,25 +220,36 @@ |
| /** |
| * Constructs an empty instance that can optionally store the start and end |
| * character offset of each token term in the text. This can be useful for |
| - * highlighting of hit locations with the Lucene highlighter package. |
| - * Protected until the highlighter package matures, so that this can actually |
| - * be meaningfully integrated. |
| + * highlighting of hit locations with the Lucene highlighter package. But |
| + * it will not store payloads; use another constructor for that. |
| * |
| * @param storeOffsets |
| * whether or not to store the start and end character offset of |
| * each token term in the text |
| */ |
| public MemoryIndex(boolean storeOffsets) { |
| - this(storeOffsets, 0); |
| + this(storeOffsets, false); |
| } |
| - |
| + |
| /** |
| + * Constructs an empty instance with the option of storing offsets and payloads. |
| + * |
| + * @param storeOffsets store term offsets at each position |
| + * @param storePayloads store term payloads at each position |
| + */ |
| + public MemoryIndex(boolean storeOffsets, boolean storePayloads) { |
| + this(storeOffsets, storePayloads, 0); |
| + } |
| + |
| + /** |
| * Expert: This constructor accepts an upper limit for the number of bytes that should be reused if this instance is {@link #reset()}. |
| * @param storeOffsets <code>true</code> if offsets should be stored |
| + * @param storePayloads <code>true</code> if payloads should be stored |
| * @param maxReusedBytes the number of bytes that should remain in the internal memory pools after {@link #reset()} is called |
| */ |
| - MemoryIndex(boolean storeOffsets, long maxReusedBytes) { |
| + MemoryIndex(boolean storeOffsets, boolean storePayloads, long maxReusedBytes) { |
| this.storeOffsets = storeOffsets; |
| + this.storePayloads = storePayloads; |
| this.bytesUsed = Counter.newCounter(); |
| final int maxBufferedByteBlocks = (int)((maxReusedBytes/2) / ByteBlockPool.BYTE_BLOCK_SIZE ); |
| final int maxBufferedIntBlocks = (int) ((maxReusedBytes - (maxBufferedByteBlocks*ByteBlockPool.BYTE_BLOCK_SIZE))/(IntBlockPool.INT_BLOCK_SIZE * RamUsageEstimator.NUM_BYTES_INT)); |
| @@ -241,6 +257,7 @@ |
| byteBlockPool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, maxBufferedByteBlocks, bytesUsed)); |
| intBlockPool = new IntBlockPool(new RecyclingIntBlockAllocator(IntBlockPool.INT_BLOCK_SIZE, maxBufferedIntBlocks, bytesUsed)); |
| postingsWriter = new SliceWriter(intBlockPool); |
| + payloadsBytesRefs = storePayloads ? new BytesRefArray(bytesUsed) : null; |
| } |
| |
| /** |
| @@ -381,8 +398,8 @@ |
| * |
| * @param fieldName |
| * a name to be associated with the text |
| - * @param stream |
| - * the token stream to retrieve tokens from. |
| + * @param tokenStream |
| + * the token stream to retrieve tokens from. The stream is always closed, even if an exception is thrown. |
| * @param boost |
| * the boost factor for hits for this field |
| * @param positionIncrementGap |
| @@ -391,16 +408,17 @@ |
| * the offset gap if fields with the same name are added more than once |
| * @see org.apache.lucene.document.Field#setBoost(float) |
| */ |
| - public void addField(String fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap) { |
| - try { |
| + public void addField(String fieldName, TokenStream tokenStream, float boost, int positionIncrementGap, |
| + int offsetGap) { |
| + try (TokenStream stream = tokenStream) { |
| if (frozen) |
| throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen"); |
| if (fieldName == null) |
| throw new IllegalArgumentException("fieldName must not be null"); |
| if (stream == null) |
| - throw new IllegalArgumentException("token stream must not be null"); |
| + throw new IllegalArgumentException("token stream must not be null"); |
| if (boost <= 0.0f) |
| - throw new IllegalArgumentException("boost factor must be greater than 0.0"); |
| + throw new IllegalArgumentException("boost factor must be greater than 0.0"); |
| int numTokens = 0; |
| int numOverlapTokens = 0; |
| int pos = -1; |
| @@ -421,8 +439,9 @@ |
| sliceArray = info.sliceArray; |
| sumTotalTermFreq = info.sumTotalTermFreq; |
| } else { |
| - fieldInfo = new FieldInfo(fieldName, fields.size(), false, false, false, |
| - this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, |
| + fieldInfo = new FieldInfo(fieldName, fields.size(), false, false, this.storePayloads, |
| + this.storeOffsets |
| + ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, |
| DocValuesType.NONE, -1, null); |
| sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY); |
| terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray); |
| @@ -431,6 +450,7 @@ |
| TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); |
| PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class); |
| OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class); |
| + PayloadAttribute payloadAtt = storePayloads ? stream.addAttribute(PayloadAttribute.class) : null; |
| BytesRef ref = termAtt.getBytesRef(); |
| stream.reset(); |
| |
| @@ -451,13 +471,16 @@ |
| } |
| sliceArray.freq[ord]++; |
| sumTotalTermFreq++; |
| - if (!storeOffsets) { |
| - postingsWriter.writeInt(pos); |
| + postingsWriter.writeInt(pos); |
| - } else { |
| - postingsWriter.writeInt(pos); |
| + if (storeOffsets) { |
| postingsWriter.writeInt(offsetAtt.startOffset() + offset); |
| postingsWriter.writeInt(offsetAtt.endOffset() + offset); |
| } |
| + if (storePayloads) { |
| + final BytesRef payload = payloadAtt.getPayload(); |
| + int pIndex = payload == null ? -1 : payloadsBytesRefs.append(payload); |
| + postingsWriter.writeInt(pIndex); |
| + } |
| sliceArray.end[ord] = postingsWriter.getCurrentOffset(); |
| } |
| stream.end(); |
| @@ -466,18 +489,10 @@ |
| if (numTokens > 0) { |
| fields.put(fieldName, new Info(fieldInfo, terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.endOffset() + offset, sumTotalTermFreq)); |
| } |
| - } catch (Exception e) { // can never happen |
| + } catch (IOException e) { |
| throw new RuntimeException(e); |
| - } finally { |
| - try { |
| - if (stream != null) { |
| - stream.close(); |
| - } |
| + } |
| - } catch (IOException e2) { |
| - throw new RuntimeException(e2); |
| - } |
| + } |
| - } |
| - } |
| |
| /** |
| * Set the Similarity to be used for calculating field norms |
| @@ -861,7 +876,7 @@ |
| |
| @Override |
| public boolean hasPayloads() { |
| - return false; |
| + return storePayloads; |
| } |
| }; |
| } |
| @@ -1043,17 +1058,20 @@ |
| } |
| |
| private class MemoryDocsAndPositionsEnum extends DocsAndPositionsEnum { |
| + private final SliceReader sliceReader; |
| private int posUpto; // for assert |
| private boolean hasNext; |
| private Bits liveDocs; |
| private int doc = -1; |
| - private SliceReader sliceReader; |
| private int freq; |
| private int startOffset; |
| private int endOffset; |
| + private int payloadIndex; |
| + private final BytesRefBuilder payloadBuilder;//only non-null when storePayloads |
| - |
| + |
| public MemoryDocsAndPositionsEnum() { |
| this.sliceReader = new SliceReader(intBlockPool); |
| + this.payloadBuilder = storePayloads ? new BytesRefBuilder() : null; |
| } |
| |
| public DocsAndPositionsEnum reset(Bits liveDocs, int start, int end, int freq) { |
| @@ -1096,15 +1114,16 @@ |
| public int nextPosition() { |
| assert posUpto++ < freq; |
| assert !sliceReader.endOfSlice() : " stores offsets : " + startOffset; |
| - if (storeOffsets) { |
| - int pos = sliceReader.readInt(); |
| + int pos = sliceReader.readInt(); |
| + if (storeOffsets) { |
| startOffset = sliceReader.readInt(); |
| endOffset = sliceReader.readInt(); |
| - return pos; |
| - } else { |
| - return sliceReader.readInt(); |
| } |
| + if (storePayloads) { |
| + payloadIndex = sliceReader.readInt(); |
| - } |
| + } |
| + return pos; |
| + } |
| |
| @Override |
| public int startOffset() { |
| @@ -1118,8 +1137,11 @@ |
| |
| @Override |
| public BytesRef getPayload() { |
| + if (payloadBuilder == null || payloadIndex == -1) { |
| - return null; |
| - } |
| + return null; |
| + } |
| + return payloadsBytesRefs.get(payloadBuilder, payloadIndex); |
| + } |
| |
| @Override |
| public long cost() { |
| @@ -1178,6 +1200,9 @@ |
| this.normSimilarity = IndexSearcher.getDefaultSimilarity(); |
| byteBlockPool.reset(false, false); // no need to 0-fill the buffers |
| intBlockPool.reset(true, false); // here we must 0-fill since we use slices |
| + if (payloadsBytesRefs != null) { |
| + payloadsBytesRefs.clear(); |
| + } |
| this.frozen = false; |
| } |
| |